This notebook's main goal is to:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown
# transparentai package : https://github.com/Nathanlauga/transparentai
from transparentai.datasets import ClassificationDataset
import transparentai.explore as explore
# Base directory from which every data path in this notebook is built.
PROJECT_PATH = '..'

# Column-level metadata table; later cells read its 'dataset', 'column',
# 'dtype', 'new_name' and 'is_protected' columns.
details_csv = f'{PROJECT_PATH}/01_collect/columns_informations.csv'
df_details = pd.read_csv(details_csv)
df_details
# Load every dataset named in the column-details table into `dfs`, keyed by
# dataset name, applying the dtypes and the new column names it declares.
dfs = {}
datasets = df_details['dataset'].unique()

for dataset in datasets:
    dataset_detail = df_details[df_details['dataset'] == dataset]
    dtypes, columns = dataset_detail['dtype'], dataset_detail['column']
    dtypes = pd.Series(dtypes.values, index=columns)

    # Datetime columns are read as plain strings and handed to `parse_dates`,
    # which performs the actual conversion.
    # `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
    parse_dates = [var for var, dtype in dtypes.items() if 'datetime' in dtype]
    # `regex=True` is required: since pandas 2.0 the pattern is otherwise
    # treated as a literal string and no datetime dtype would be rewritten.
    dtypes = dtypes.str.replace(r'datetime.*', 'str', regex=True)

    dfs[dataset] = pd.read_csv(
        f'{PROJECT_PATH}/_data/{dataset}.csv',
        dtype=dtypes.to_dict(),
        parse_dates=parse_dates,
    )

    # Use the new name when one is provided, otherwise keep the original one.
    # NOTE(review): the assignment is positional, so it assumes the CSV column
    # order matches the row order of the details table -- confirm.
    columns = np.where(dataset_detail['new_name'].isna(), columns, dataset_detail['new_name'])
    dfs[dataset].columns = columns
# Show a heading (name and shape) followed by the first rows of each dataset.
for dataset, frame in dfs.items():
    heading = Markdown(f'#### {dataset}, {frame.shape}')
    display(heading)
    display(frame.head())
Here we have only one dataset, adult, but if later versions add other datasets it is important to assign each of them to its own variable.
# Bind the adult dataset to its own name and drop the temporary dict.
adult = dfs['adult']
del dfs

# Missing values overview.
help(explore.show_missing_values)
display(Markdown('#### Missing values for adult dataset'))
explore.show_missing_values(adult)

# Presumably drops variables holding a single distinct value (going by the
# helper's name) -- confirm against the transparentai documentation.
from transparentai.utils import remove_var_with_one_value
adult = remove_var_with_one_value(adult)

# Exploration helpers that take the target variable: print each helper's
# documentation, then run it on the adult dataset.
target_plots = (
    explore.show_df_vars,
    explore.show_df_numerical_relations,
    explore.show_df_num_cat_relations,
)
for plot in target_plots:
    help(plot)
    plot(df=adult, target='income')

# The correlation view takes no target.
help(explore.show_df_correlations)
explore.show_df_correlations(df=adult)
Now let's take a look at dataset bias.
# Names of the variables flagged as protected in the column-details table,
# using the new name when one is provided and the original one otherwise.
protected_rows = df_details.loc[df_details['is_protected'] == 1]
renamed = protected_rows['new_name']
protected_vars = np.where(renamed.isna(), protected_rows['column'], renamed)

# List the distinct values taken by each protected variable.
for var in protected_vars:
    heading = Markdown(f'#### {var}')
    display(heading)
    display(adult[var].unique())
For this purpose, convert age into a categorical variable using the following rule:
# Age buckets used for the bias analysis:
#   age < 26        -> 'Young'
#   26 <= age < 61  -> 'Adult'
#   anything else   -> 'Elder'
adult['age category'] = np.where(
    adult['age'] < 26, 'Young',
    np.where(adult['age'] < 61, 'Adult', 'Elder'),
)
# Label column analysed for bias.
target = 'income'

# For each protected attribute, the values treated as the privileged group.
privileged_values = {
    'age category': ['Adult'],
    'marital-status': ['Married-civ-spouse', 'Married-AF-spouse'],
    'race': ['White'],
    'gender': ['Male'],
}

fair_dataset = ClassificationDataset(
    df=adult,
    label_name=target,
    privileged_values=privileged_values,
)

# Bias metrics for the '>50K' label value.
fair_dataset.show_bias_metrics(label_value='>50K')
from ipywidgets import interact
import ipywidgets as widgets

# Interactive version of the bias metrics: pick a protected attribute and a
# label value from two dropdowns.
attr_options = list(fair_dataset.protected_attributes.keys())

# De-duplicate the label values while keeping first-seen order: the raw
# column holds one entry per row, which would give the dropdown one option
# per row.
label_options = list(dict.fromkeys(
    fair_dataset.df[fair_dataset.label_name].values.tolist()
))

# `attrs` and `labels` end up bound to the widgets, as before.
attrs = widgets.Dropdown(options=attr_options, value=attr_options[0])
labels = widgets.Dropdown(options=label_options, value=label_options[0])

interact(fair_dataset.show_bias_metrics, attr=attrs, label_value=labels)
Now you have a lot of information about your dataset! You can go deeper by transforming your data and re-executing a notebook based on this template if necessary.
Don't forget to write up your insights about this dataset in a worksheet or slides so that business people can understand what you found without having to go through this notebook themselves.
Thanks for reading. Nathan